import re
import urllib.parse
from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extract follow-up links and page data from a fetched HTML page."""

    # Only anchors whose href contains "/item" are treated as crawl targets.
    _ITEM_HREF = re.compile(r'/item')

    def _get_new_urls(self, url, soup):
        """Return the set of absolute URLs for every matching anchor in *soup*.

        Each ``href`` is resolved against *url*, so relative links become
        absolute and duplicate targets collapse into a single entry.
        """
        anchors = soup.find_all('a', href=self._ITEM_HREF)
        return {urllib.parse.urljoin(url, anchor['href']) for anchor in anchors}

    def _get_new_data(self, url, soup):
        """Return a dict with ``url``, ``title`` and ``summary`` for the page.

        Returns ``None`` when the page lacks the title heading or the summary
        block, i.e. when it does not have the expected layout.
        """
        # ``find`` returns None when nothing matches, so each lookup has to be
        # checked before it is dereferenced; chaining them would raise
        # AttributeError on pages without a title block.
        title_holder = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        if title_holder is None:
            return None
        title_node = title_holder.find('h1')
        if title_node is None:
            return None

        summary_node = soup.find('div', class_='lemma-summary')
        if summary_node is None:
            return None

        return {
            'url': url,
            'title': title_node.get_text(),
            'summary': summary_node.get_text(),
        }

    def parser(self, url, content):
        """Parse *content* fetched from *url*.

        Returns ``None`` if either argument is ``None``; otherwise returns a
        ``(new_urls, new_data)`` tuple where ``new_urls`` is a set of absolute
        URLs and ``new_data`` is a dict, or ``None`` if the page has no
        extractable data.
        """
        if url is None or content is None:
            return None

        # ``from_encoding`` is only meaningful for byte input; Beautiful Soup
        # ignores it (and warns) when the markup is already a str.
        if isinstance(content, (bytes, bytearray)):
            soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        else:
            soup = BeautifulSoup(content, 'html.parser')

        new_urls = self._get_new_urls(url, soup)
        new_data = self._get_new_data(url, soup)
        return new_urls, new_data


